These two datasets are about “diagnosed diabetes among adults aged >=18 years” and “Obesity among adults aged >=18 years” in 2017 from the CDC. They include estimates for the 500 largest US cities and approximately 28,000 census tracts within these cities.
I used API method to obtain my datasets from CDC. First, you have to
create an account with password. Then, you have to apply for a free app
token. Last, copy your API Endpoint. Here are my datasets links:
https://chronicdata.cdc.gov/500-Cities-Places/500-Cities-Obesity-among-adults-aged-18-years/bjvu-3y7d
https://chronicdata.cdc.gov/500-Cities-Places/500-Cities-Diagnosed-diabetes-among-adults-aged-18/cn78-b9bj
dia <- read.socrata(
"https://chronicdata.cdc.gov/resource/cn78-b9bj.json?year=2017",
app_token = "bEkVW73ASzmTkZ9riAtf2YS5c",
email = "clu74108@usc.edu",
password = "Samuelsunny0325!"
)
dia <- as.data.table(dia)
write.csv(dia,"dia.csv", row.names = F)
obe <- read.socrata(
"https://chronicdata.cdc.gov/resource/bjvu-3y7d.json?year=2017",
app_token = "bEkVW73ASzmTkZ9riAtf2YS5c",
email = "clu74108@usc.edu",
password = "Samuelsunny0325!"
)
obe <- as.data.table(obe)
write.csv(obe,"obe.csv", row.names = F)
Both datasets contain 27 columns and 29,006 rows.
I select data_value, populationcount, stateabbr, statedesc(state name), city_name, geolocation.latitude, and geolocation.longitude total 7 columns.
dia_mini <- dia[, c(2, 3, 14, 17, 19, 20, 23)]
obe_mini <- obe[, c(2, 3, 14, 17, 19, 20, 23)]
Then I change my column names in order to easily understand.
colnames(dia_mini)[1] <- "diabetes_percentage"
colnames(dia_mini)[2] <- "populationCount"
colnames(dia_mini)[3] <- "state_abbr"
colnames(dia_mini)[4] <- "state_name"
colnames(dia_mini)[5] <- "city_name"
colnames(dia_mini)[6] <- "lat"
colnames(dia_mini)[7] <- "lon"
colnames(obe_mini)[1] <- "obesity_percentage"
colnames(obe_mini)[2] <- "populationCount"
colnames(obe_mini)[3] <- "state_abbr"
colnames(obe_mini)[4] <- "state_name"
colnames(obe_mini)[5] <- "city_name"
colnames(obe_mini)[6] <- "lat"
colnames(obe_mini)[7] <- "lon"
Merge two datasets by state_abbr, populationCount, state_name, city_name, lat, and lon.
merged <-
merge(
# Data
x = dia_mini,
y = obe_mini,
# List of variables to match
by = c("state_abbr","populationCount", "state_name", "city_name", "lat", "lon"),
# keep everything!
all.x = TRUE
)
dim(merged)
## [1] 30008 8
My row number increased to 30,008 so I have to remove duplicates.
merged[, n := 1:.N, by = .(state_abbr, state_name, city_name, lat, lon)]
merged <- merged[n == 1,][, n := NULL]
length(unique(merged$lat))
## [1] 28505
After removing duplicates, my rows shrink from 30,008 to 28,505.
merged$lat <- as.numeric(merged$lat)
merged$lon <- as.numeric(merged$lon)
merged$diabetes_percentage <- as.numeric(merged$diabetes_percentage)
merged$populationCount <- as.numeric(merged$populationCount)
merged$obesity_percentage <- as.numeric(merged$obesity_percentage)
mean(is.na(merged$diabetes_percentage))
## [1] 0.02785476
mean(is.na(merged$obesity_percentage))
## [1] 0.02785476
There are only 2.7% NAs in my dataset which are not significant. Therefore, I’m going to replace NA values with mean.
merged[, diabetes_percentage := fcoalesce(diabetes_percentage, mean(diabetes_percentage, na.rm = TRUE))]
merged[, obesity_percentage := fcoalesce(obesity_percentage, mean(obesity_percentage, na.rm = TRUE))]
I create a new column contain Northeast, Northwest, Southwest, and Southeast four different regions
# Add regions
merged[, region := fifelse(lon >= -98 & lat > 39.71, "NE",
fifelse(lon < -98 & lat > 39.71, "NW",
fifelse(lon < -98 & lat <= 39.71, "SW","SE")))
]
#table(merged$region)
By using scatter plots, I can see whether there is a correlation between obesity percentage and diabetes percentage.
#All states
p1_scatter <- merged %>%
plot_ly(x = ~obesity_percentage, y= ~diabetes_percentage,
type = 'scatter', mode = 'markers', color = ~state_abbr,
hoverinfo = 'text',
text = ~paste( paste(state_name, ":", sep=""),
paste(city_name, ":", sep=""),
paste(" Obesity percentage: ", obesity_percentage, sep=""),
paste(" Diabetes percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Obesity percentage vs. Diabetes percentage with all cities",
xaxis = list(title = "Obesity percentage"),
yaxis = list(title = "Diabetes percentage"),
hovermode = "compare")
#Different regions
p2_scatter <- merged[!is.na(region)] %>%
plot_ly(x = ~obesity_percentage, y= ~diabetes_percentage,
type = 'scatter', mode = 'markers', color = ~region,
hoverinfo = 'text',
text = ~paste( paste(state_name, ":", sep=""),
paste(city_name, ":", sep=""),
paste(region, ":", sep=""),
paste(" Obesity percentage: ", obesity_percentage, sep=""),
paste(" Diabetes percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Obesity percentage vs. Diabetes percentage in different regions",
xaxis = list(title = "Obesity percentage"),
yaxis = list(title = "Diabetes percentage"),
hovermode = "compare")
#Cities in CA
p3_scatter <- merged[state_abbr == "CA"] %>%
plot_ly(x = ~obesity_percentage, y= ~diabetes_percentage,
type = 'scatter', mode = 'markers', color = ~city_name,
hoverinfo = 'text',
text = ~paste( paste(state_name, ":", sep=""),
paste(city_name, ":", sep=""),
paste(" Obesity percentage: ", obesity_percentage, sep=""),
paste(" Diabetes percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Obesity percentage vs. Diabetes percentage in CA",
xaxis = list(title = "Obesity percentage"),
yaxis = list(title = "Diabetes percentage"),
hovermode = "compare")
From the scatter plot with all states in the US, we can see that there is a positive correlation between obesity and diabetes rates in different states and so as in different regions.
First, create a color palette so we can see the severity of each place.
pal_dia <- colorNumeric(c('darkblue','goldenrod','darkred'), domain=merged$diabetes_percentage)
# Diabetes percentage in the US
p1_leaflet <- leaflet() %>%
addProviderTiles('OpenStreetMap') %>%
addCircles(data = merged,
lat=~lat,lng=~lon,
label = ~paste0(round(diabetes_percentage,2)), color = ~ pal_dia(diabetes_percentage),
opacity = 0.5, fillOpacity = 1, radius = 50) %>%
# Legend
addLegend('bottomleft', pal=pal_dia, values=merged$diabetes_percentage,
title='Diabetes percentage', opacity=1)
# Diabetes percentage in LA
p2_leaflet <- leaflet() %>%
addProviderTiles('OpenStreetMap') %>%
addCircles(data = merged[merged$city_name == "Los Angeles"],
lat=~lat,lng=~lon,
label = ~paste0(round(diabetes_percentage,2)), color = ~ pal_dia(diabetes_percentage),
opacity = 0.5, fillOpacity = 1, radius = 50) %>%
# Legend
addLegend('bottomleft', pal=pal_dia, values=merged$diabetes_percentage,
title='Diabetes percentage', opacity=1)
pal_obe <- colorNumeric(c('darkblue','goldenrod','darkred'), domain=merged$obesity_percentage)
# Obesity percentage in LA
p3_leaflet <- leaflet() %>%
addProviderTiles('OpenStreetMap') %>%
addCircles(data = merged[merged$city_name == "Los Angeles"],
lat=~lat,lng=~lon,
label = ~paste0(round(obesity_percentage,2)), color = ~ pal_obe(obesity_percentage),
opacity = 0.5, fillOpacity = 1, radius = 50) %>%
# And a pretty legend
addLegend('bottomleft', pal=pal_obe, values=merged$obesity_percentage,
title='Obesity percentage', opacity=1)
At my first glance, I see there are more orange dots in the NE region. From the second plot, dots closer to Downtown LA have higher rates of diabetes in orange color. Similar to the second plot, dots near downtown LA get more orange color than other places.
Show median, max, min, and length of diabetes_percentage and obesity_percentage columns in different cities
# Show the top 5 highest median of diabetes percentage cities in the US
diabetes_median_hi <- merged[, .(
diabetes_median = median(diabetes_percentage, na.rm = T),
diabetes_max = max(diabetes_percentage, na.rm = T),
diabetes_min = min(diabetes_percentage, na.rm = T),
diabetes_length = length(diabetes_percentage)
), by=c("city_name", "state_name", "region")][order(-diabetes_median)]
knitr::kable(diabetes_median_hi[1:5,], caption = "Top 5 highest median of diabetes percentage cities in the US")
| city_name | state_name | region | diabetes_median | diabetes_max | diabetes_min | diabetes_length |
|---|---|---|---|---|---|---|
| Gary | Indiana | NE | 24.05 | 29.0 | 16.50000 | 32 |
| Detroit | Michigan | NE | 18.90 | 29.3 | 5.70000 | 297 |
| Youngstown | Ohio | NE | 18.70 | 27.2 | 10.80465 | 33 |
| Camden | New Jersey | NE | 18.15 | 22.2 | 14.50000 | 20 |
| Brownsville | Texas | SE | 18.05 | 25.2 | 10.80465 | 52 |
# Show the top 5 highest median of obesity percentage cities in the US
obesity_median_hi <- merged[, .(
obesity_median = median(obesity_percentage, na.rm = T),
obesity_max = max(obesity_percentage, na.rm = T),
obesity_min = min(obesity_percentage, na.rm = T),
obesity_length = length(obesity_percentage)
), by=c("city_name", "state_name", "region")][order(-obesity_median)]
knitr::kable(obesity_median_hi[1:5,], caption = "Top 5 highest median of obesity percentage cities in the US")
| city_name | state_name | region | obesity_median | obesity_max | obesity_min | obesity_length |
|---|---|---|---|---|---|---|
| Gary | Indiana | NE | 50.3 | 54.7 | 40.50000 | 32 |
| Flint | Michigan | NE | 49.7 | 55.4 | 30.46873 | 41 |
| Youngstown | Ohio | NE | 46.1 | 53.7 | 30.46873 | 33 |
| Pharr | Texas | SW | 45.6 | 48.3 | 39.90000 | 10 |
| Detroit | Michigan | NE | 45.5 | 55.3 | 27.90000 | 297 |
# Show the top 5 lowest median of diabetes percentage cities in the US
diabetes_median_lo <- merged[, .(
diabetes_median = median(diabetes_percentage, na.rm = T),
diabetes_max = max(diabetes_percentage, na.rm = T),
diabetes_min = min(diabetes_percentage, na.rm = T),
diabetes_length = length(diabetes_percentage)
), by=c("city_name", "state_name", "region")][order(diabetes_median)]
knitr::kable(diabetes_median_lo[1:5,], caption = "Top 5 lowest median of diabetes percentage cities in the US")
| city_name | state_name | region | diabetes_median | diabetes_max | diabetes_min | diabetes_length |
|---|---|---|---|---|---|---|
| Boulder | Colorado | NW | 5.0 | 6.7 | 1.4 | 29 |
| Fort Collins | Colorado | NW | 5.1 | 14.6 | 1.6 | 48 |
| Somerville | Massachusetts | NE | 5.4 | 8.2 | 2.5 | 19 |
| College Station | Texas | SE | 5.4 | 10.0 | 1.7 | 27 |
| Cambridge | Massachusetts | NE | 5.7 | 8.8 | 1.7 | 33 |
# Show the top 5 lowest median of obesity percentage cities in the US
obesity_median_lo <- merged[, .(
obesity_median = median(obesity_percentage, na.rm = T),
obesity_max = max(obesity_percentage, na.rm = T),
obesity_min = min(obesity_percentage, na.rm = T),
obesity_length = length(obesity_percentage)
), by=c("city_name", "state_name", "region")][order(obesity_median)]
knitr::kable(obesity_median_lo[1:5,], caption = "Top 5 lowest median of diabetes percentage cities in the US")
| city_name | state_name | region | obesity_median | obesity_max | obesity_min | obesity_length |
|---|---|---|---|---|---|---|
| Boulder | Colorado | NW | 15.70 | 18.20000 | 11.7 | 29 |
| Milpitas | California | SW | 15.75 | 30.46873 | 12.5 | 18 |
| Irvine | California | SW | 16.40 | 30.46873 | 14.5 | 38 |
| Sunnyvale | California | SW | 16.65 | 30.46873 | 14.1 | 32 |
| Fremont | California | SW | 16.95 | 21.70000 | 11.4 | 44 |
(thoughts?)
p1_box <- merged[!is.na(diabetes_percentage)][!is.na(region)] %>%
plot_ly(x = ~region, y= ~diabetes_percentage,
type = 'box', mode = 'markers', color = ~region,
hoverinfo = 'text',
text = ~paste( paste(state_name, "-", sep=""),
paste(region, "-", sep=""),
paste(city_name, "-", sep=""),
paste(" Diabetes percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Diabetes percentage in different regions",
xaxis = list(title = "Regions"),
yaxis = list(title = "Diabetes percentage"),
hovermode = "compare")
p2_box <- merged[!is.na(obesity_percentage)][!is.na(region)] %>%
plot_ly(x = ~region, y= ~obesity_percentage,
type = 'box', mode = 'markers', color = ~region,
hoverinfo = 'text',
text = ~paste( paste(state_name, "-", sep=""),
paste(region, "-", sep=""),
paste(city_name, "-", sep=""),
paste(" Obesity percentage: ", obesity_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Obesity percentage in different regions",
xaxis = list(title = "Regions"),
yaxis = list(title = "Obesity percentage"),
hovermode = "compare")
There is a highest diabetes_percentage almost 40% in NE region.
In NE region, there is a max obesity_percentage occurred. However, SE region has the highest median of obesity_percentage.
Among all cities, Gary, in NE region, has both the highest mean of diabetes_percentage(24.05%) and obesity_percentage(50.30%). On the other hand, Boulder, in NW region, has both the lowest mean of diabetes_percentage(5.00%) and obesity_percentage(15.70%).
And from the boxplot, NW region has the lowest median in diabetes_percentage and obesity_percentage.
Among all cities, Gary, in NE region, has the highest median rates of diabetes(24.05%) and he highest median rates of obesity(50.30%). On the other hand, Boulder, in NW region, has the lowest median rates of diabetes(5.00%) and the lowest median rates of obesity(15.70%).
We can see that there is a positive correlation between obesity and diabetes rates from the scatter plot by states, so as the scatter plot by regions. Apart from the result, we can also see that the data points in the Northwest are fewer than in other regions.
From the leaflet, cities closer to Downtown LA have higher rates of diabetes in orange color. Similar to the result of diabetes percentage, the region near downtown LA shows orange color dots as well.
From histograms, the NE region has higher counts of diabetes_percentage. We can also see that the NE region has higher counts of obesity_percentage as well. However, all results from histograms might be affected by different numbers and sizes of cities in different regions.
There is the highest diabetes_percentage in the NE region(almost 40%). However, the SE region has the highest median of obesity_percentage. The NW region has both the lowest median in diabetes_percentage and obesity_percentage.
Compare diabetes_percentage in different regions.
p1_his <- merged[!is.na(diabetes_percentage)][!is.na(region)] %>%
plot_ly(x= ~diabetes_percentage,
type = 'histogram', mode = 'markers', color= ~region,
hoverinfo = 'text',
text = ~paste( paste(state_name, ":", sep=""),
paste(city_name, ":", sep=""),
paste(" Diabetes percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Diabetes percentage in different regions",
xaxis = list(title = "Diabetes percentage"),
yaxis = list(title = "counts"),
hovermode = "compare")
From this histogram, we can see that NE region has higher counts of diabetes_percentage. However, it might be affected by different number and size of cities in different regions.
p2_his <- merged[!is.na(obesity_percentage)][!is.na(region)] %>%
plot_ly(x= ~obesity_percentage,
type = 'histogram', mode = 'markers', color= ~region,
hoverinfo = 'text',
text = ~paste( paste(state_name, ":", sep=""),
paste(city_name, ":", sep=""),
paste(" Obesity percentage: ", diabetes_percentage, sep=""),
sep = "<br>")) %>%
layout(title = "Obesity percentage in different regions",
xaxis = list(title = "Obesity percentage"),
yaxis = list(title = "counts"),
hovermode = "compare")
we can see that NE region has higher counts of obesity_percentage as well. However, it might be also affected by different number and size of cities in different regions.
Copyright © 2020, Sam Lu.